################################################################################
### Replication Code: Data Cleaning
################################################################################
#
# Paper: Rationalizing Protest Participation  
#
# Authors: Tim Baule, Jonathan Bothner, Maximilian Kähny
#
# Software: R version 4.4.0 using Windows
#
################################################################################



# libraries
library(haven)   
library(dplyr)  

# working directory
# Start from an empty global environment so that objects left over from an
# earlier session cannot leak into the cleaning steps below.
rm(list = ls())

# Author's local project folder. All file names used below are relative to it.
project_dir <- "C:/Users/timba/OneDrive - Universität Bayreuth (1)/Uni/Research General/Protest Network Project/empirics/do files and skripts/final"

# Only change directory when that folder exists. On any other machine the
# script keeps the current working directory instead of aborting, so it can be
# run from the folder that holds the replication files.
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message(
    "Project folder not found; using current working directory: ",
    getwd()
  )
}

## 1. Load data ---------------------------------------------------------------
# Read the ANES 2020 time-series file (Stata format) from the working directory.
anes <- read_dta("anes_timeseries_2020_stata_20220210.dta")

## 2. Rename
# V203000 becomes state_id, the key used to merge the state coordinates below.
anes <- anes %>% rename(state_id = V203000)

## 3. Merge (m:1) on state_id; keep only longitude & latitude
state_coords <- read_dta("state_coordinates.dta") %>%
  select(state_id, longitude, latitude)

# relationship = "many-to-one" makes the join fail loudly if state_coords
# contains a state_id more than once; without it, duplicated keys would
# silently duplicate respondents instead of producing an m:1 merge.
anes <- anes %>%
  left_join(state_coords, by = "state_id", relationship = "many-to-one")

## 4. Zero-padded state_id ----------------------------------------------------
# Convert state_id to a character code, prefixing values below 10 with "0"
# (e.g. 9 -> "09"). A missing state_id stays NA; pasting the pieces together
# unconditionally would turn it into the literal string "NANA", which
# complete.cases() no longer recognises as missing.
anes <- anes %>%
  mutate(
    state_chr = as.character(state_id),
    state_pad = case_when(
      is.na(state_id) ~ NA_character_,
      state_id < 10   ~ paste0("0", state_chr),
      TRUE            ~ state_chr
    )
  ) %>%
  # Drop the numeric original and the helper; the padded version takes over the
  # name state_id and ends up as the last column of the data set.
  select(-state_id, -state_chr) %>%
  rename(state_id = state_pad)


## 6. Protest participation ---------------------------------------------------
# protest is 1 when V202025 equals 1, NA when V202025 is negative or missing,
# and 0 for every other value.
anes <- anes %>%
  mutate(
    protest = case_when(
      is.na(V202025) | V202025 < 0 ~ NA_real_,
      V202025 == 1                 ~ 1,
      TRUE                         ~ 0
    )
  )

## 7. Network variables -------------------------------------------------------
# Each variable is a copy of its ANES source column with negative codes set to
# NA. trust is additionally reverse-coded.
anes <- anes %>%
  mutate(
    # age: V201507x
    age = if_else(V201507x < 0, NA_real_, V201507x),

    # education: V201510; code 95 is treated as missing as well
    educ = if_else(V201510 < 0 | V201510 == 95, NA_real_, V201510),

    # income: V201617x
    inc = if_else(V201617x < 0, NA_real_, V201617x),

    # left-right placement (V202439) and its absolute distance from 5
    leftright  = if_else(V202439 < 0, NA_real_, V202439),
    leftright2 = if_else(leftright < 0, NA_real_, abs(5 - leftright)),

    # religion item: V201433
    reli = if_else(V201433 < 0, NA_real_, V201433),

    # trust: V201237 reversed (1 -> 4, 2 -> 3, 3 -> 2, 4 -> 1); any other
    # non-negative value becomes 0, negative or missing values become NA
    trust = case_when(
      is.na(V201237) | V201237 < 0 ~ NA_real_,
      V201237 == 1                 ~ 4,
      V201237 == 2                 ~ 3,
      V201237 == 3                 ~ 2,
      V201237 == 4                 ~ 1,
      TRUE                         ~ 0
    )
  )

## 8. Individual variables ----------------------------------------------------
# Every recode below treats ALL negative source codes (and NA) as missing, the
# same rule sections 6 and 7 apply. Listing individual codes per variable
# (e.g. only -9, or only -7/-6) silently turned any other negative code into 0.
# NOTE(review): assumes no negative code carries substantive meaning for these
# items -- confirm against the ANES codebook.
is_missing_code <- function(x) is.na(x) | x < 0

anes <- anes %>%
  mutate(
    # sex/gender: 1 when V201600 == 1, otherwise 0
    male = case_when(
      is_missing_code(V201600) ~ NA_real_,
      V201600 == 1             ~ 1,
      TRUE                     ~ 0
    ),

    # ethnicity: 1 when V201549x == 2, otherwise 0
    black = case_when(
      is_missing_code(V201549x) ~ NA_real_,
      V201549x == 2             ~ 1,
      TRUE                      ~ 0
    ),

    # health condition: V201623 reversed (1 -> 4, 2 -> 3, 3 -> 2, 4 -> 1);
    # any other non-negative value becomes 0
    health = case_when(
      is_missing_code(V201623) ~ NA_real_,
      V201623 == 1             ~ 4,
      V201623 == 2             ~ 3,
      V201623 == 3             ~ 2,
      V201623 == 4             ~ 1,
      TRUE                     ~ 0
    ),

    # married: 1 when V201508 is 1 or 2, otherwise 0
    married = case_when(
      is_missing_code(V201508) ~ NA_real_,
      V201508 %in% c(1, 2)     ~ 1,
      TRUE                     ~ 0
    ),

    # children: values 1-4 of V201567 are kept, everything else becomes 0
    children = case_when(
      is_missing_code(V201567) ~ NA_real_,
      V201567 == 1             ~ 1,
      V201567 == 2             ~ 2,
      V201567 == 3             ~ 3,
      V201567 == 4             ~ 4,
      TRUE                     ~ 0
    ),

    # volunteer work: 1 when V202033 == 1, otherwise 0
    volunteer = case_when(
      is_missing_code(V202033) ~ NA_real_,
      V202033 == 1             ~ 1,
      TRUE                     ~ 0
    ),

    # community service: 1 when V202032 == 1, otherwise 0
    community_service = case_when(
      is_missing_code(V202032) ~ NA_real_,
      V202032 == 1             ~ 1,
      TRUE                     ~ 0
    ),

    # attend church: 1 when V201452 == 1, otherwise 0
    attend_church = case_when(
      is_missing_code(V201452) ~ NA_real_,
      V201452 == 1             ~ 1,
      TRUE                     ~ 0
    )
  )


# FE for census region
# state_id is a zero-padded two-character string at this point ("01", "09",
# "23", ...). %in% compares a character vector with numbers by converting the
# numbers to text, so "09" would never match 9 and the single-digit states
# would fall through to "West". The code lists are therefore padded the same
# way before matching.
pad_state_code <- function(codes) sprintf("%02d", codes)

northeast_codes <- pad_state_code(
  c(23L, 33L, 50L, 25L, 44L, 9L, 36L, 42L, 34L)
)
midwest_codes <- pad_state_code(
  c(39L, 18L, 17L, 26L, 55L, 27L, 19L, 29L, 38L, 46L, 31L, 20L)
)
south_codes <- pad_state_code(
  c(10L, 24L, 11L, 51L, 54L, 37L, 45L, 13L, 12L, 21L, 47L, 1L, 28L, 5L,
    22L, 40L, 48L)
)

anes <- anes %>%
  mutate(
    census_region = case_when(
      state_id %in% northeast_codes ~ "Northeast",
      state_id %in% midwest_codes   ~ "Midwest",
      state_id %in% south_codes     ~ "South",
      # West (everything else)
      TRUE                          ~ "West"
    ),
    # explicit factor so the level order (and thus the baseline) is fixed
    census_region = factor(
      census_region,
      levels = c("Northeast", "Midwest", "South", "West")
    )
  )

# 2) turn into three dummies (use West as the omitted category)
anes <- anes %>%
  mutate(
    dummy_NE    = as.integer(census_region == "Northeast"),
    dummy_MW    = as.integer(census_region == "Midwest"),
    dummy_South = as.integer(census_region == "South")
  )

# create variable lists
# other_covars: covariate names, including the three region dummies.
other_covars <- c(
  "male", "age", "black", "inc", "educ", "leftright", "leftright2",
  "health", "married", "children", "reli", "trust",
  "dummy_NE", "dummy_MW", "dummy_South"
)
state_id_var <- "state_id"
# all_vars: columns that must be non-missing for a row to be kept.
all_vars <- c(
  "protest", "leftright", "leftright2", "reli", "trust",
  "age", "educ", "inc", "longitude", "latitude",
  "black", "male", "health", "married", "children",
  state_id_var
)

# drop nas
# cc flags the rows with no missing value in any of the all_vars columns.
cc <- complete.cases(anes[all_vars])

# Keep the complete rows and store educ, inc and reli as plain numerics.
anes_cc <- anes[cc, ] %>%
  mutate(across(c(educ, inc, reli), as.numeric))


## Save --------------------------------------------------------------------
# Write the cleaned complete-case data in Stata format to the working directory.
write_dta(anes_cc, "anes_timeseries_2020_clean.dta")

